In [54]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy
from numpy import corrcoef, sum, log, arange, exp, isnan
import csv
import nltk

titanic = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
print("Training set : %i (%0.2f) , Test set : %i (%0.2f)" % (len(titanic),float(len(titanic.dropna()))/len(titanic)*100,len(test),float(len(test.dropna()))/len(test)*100))
print("Features : %i" % len(titanic.columns))
for i in titanic.columns:
    print(i,len(titanic[titanic[i].notnull()]),get_my_column_type(titanic[i]),len(titanic[i].unique()))
print(", ".join(titanic.columns))


Training set : 891 (20.54) , Test set : 418 (20.81)
Features : 12
('PassengerId', 891, <type 'numpy.int64'>, 891)
('Survived', 891, <type 'numpy.int64'>, 2)
('Pclass', 891, <type 'numpy.int64'>, 3)
('Name', 891, <type 'str'>, 891)
('Sex', 891, <type 'str'>, 2)
('Age', 714, <type 'numpy.float64'>, 89)
('SibSp', 891, <type 'numpy.int64'>, 7)
('Parch', 891, <type 'numpy.int64'>, 7)
('Ticket', 891, <type 'str'>, 681)
('Fare', 891, <type 'numpy.float64'>, 248)
('Cabin', 204, <type 'str'>, 148)
('Embarked', 889, <type 'str'>, 4)
PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked
Create the title class.

In [138]:
titles = {'Mr.':'Mr.', 'Miss.':'Miss.','Mrs.':'Mrs.','Master':'Master',
          'Ms.':'Mrs.', 'Mme.':'Mrs.', 'Countess.':'Mrs.',
          'Miss.':'Miss.','Mlle.':'Miss.', 
          'Don.':'Mr.','Rev.':'Mr.','Dr.':'Mr.', 'Major.':'Mr.', 'Col.':'Mr.', 'Capt.':'Mr.', 'Jonkheer.':'Mr.'}
titanic["title"] = ''
test["title"] = ''
for title in titles:
    titanic.loc[(titanic.Name.str.contains(title))&(titanic.title == ''),'title'] = titles[title]
    test.loc[(test.Name.str.contains(title))&(test.title == ''),'title'] = titles[title]

In [178]:
ticket_list = {}
titanic["on_the_ticket"] = 0
test["on_the_ticket"] = 0

for the_ticket in titanic.Ticket.dropna():
    if the_ticket in ticket_list:
        ticket_list[the_ticket] += 1
    else:
        ticket_list[the_ticket] = 0
for ticket in ticket_list:
    titanic.loc[(titanic.Ticket==ticket)&(titanic.SibSp==0)&(titanic.SibSp==0)&(titanic.Parch==0),"on_the_ticket"] = ticket_list[ticket]
    test.loc[(test.Ticket==ticket)&(test.SibSp==0)&(test.Parch==0),"on_the_ticket"] = ticket_list[ticket]

titanic["total_relatives"] = titanic["on_the_ticket"] + titanic["SibSp"] + titanic["Parch"]
test["total_relatives"] = test["on_the_ticket"] + test["SibSp"] + test["Parch"]
titanic["in_a_group"] = (titanic["total_relatives"]>0)
test["in_a_group"] = (test["total_relatives"]>0)

In [200]:
death_counts = pd.crosstab([titanic["title"],titanic["Pclass"]], titanic["Survived"].astype(bool))
death_counts.plot(kind='bar', stacked=True, color=['black','gold'], grid=False)
death_counts.div(death_counts.sum(1).astype(float), axis=0).plot(kind='barh', stacked=True, color=['black','gold'])


Out[200]:
<matplotlib.axes._subplots.AxesSubplot at 0x11811af10>
Add Bag of words for Name.

In [71]:
bag_of_words = {}
for the_name in titanic.Name.dropna():
    for token in nltk.tokenize.word_tokenize(the_name):
        if token in bag_of_words:
            bag_of_words[token] += 1
        else:
            bag_of_words[token] = 1
sorted([(x,bag_of_words[x]) for x in bag_of_words if len(x)>2],key=lambda x:x[1],reverse=True)[0:15]


Out[71]:
[('Mr.', 516),
 ('Miss', 182),
 ('Mrs.', 125),
 ('William', 64),
 ('John', 44),
 ('Master', 40),
 ('Henry', 35),
 ('James', 24),
 ('George', 24),
 ('Charles', 23),
 ('Thomas', 22),
 ('Mary', 20),
 ('Edward', 18),
 ('Anna', 17),
 ('Joseph', 16)]

In [198]:
the_type = "Mr."
death_counts = pd.crosstab([titanic[titanic['title']==the_type]["Pclass"],titanic[titanic['title']==the_type]["Age"]//10], titanic[titanic['title']==the_type]["Survived"].astype(bool))
death_counts.plot(kind='bar', stacked=True, color=['black','gold'], grid=False)
death_counts.div(death_counts.sum(1).astype(float), axis=0).plot(kind='barh', stacked=True, color=['black','gold'])


Out[198]:
<matplotlib.axes._subplots.AxesSubplot at 0x117a2a150>

In [137]:
titanic.loc[(titanic.Ticket.str.contains("237736"))]


Out[137]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked title
9 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 0 237736 30.0708 NaN C Mrs.
122 123 0 2 Nasser, Mr. Nicholas male 32.5 1 0 237736 30.0708 NaN C Mr.

In [192]:
titanic[(titanic.Pclass==3)&(titanic.Cabin.notnull())]


Out[192]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked title on_the_ticket total_relatives in_a_group
10 11 1 3 Sandstrom, Miss. Marguerite Rut female 4 1 1 PP 9549 16.7000 G6 S Miss. 0 2 True
75 76 0 3 Moen, Mr. Sigurd Hansen male 25 0 0 348123 7.6500 F G73 S Mr. 0 0 False
128 129 1 3 Peter, Miss. Anna female NaN 1 1 2668 22.3583 F E69 C Miss. 0 2 True
205 206 0 3 Strom, Miss. Telma Matilda female 2 0 1 347054 10.4625 G6 S Miss. 0 1 True
251 252 0 3 Strom, Mrs. Wilhelm (Elna Matilda Persson) female 29 1 1 347054 10.4625 G6 S Mrs. 0 2 True
394 395 1 3 Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengt... female 24 0 2 PP 9549 16.7000 G6 S Mrs. 0 2 True
429 430 1 3 Pickard, Mr. Berk (Berk Trembisky) male 32 0 0 SOTON/O.Q. 392078 8.0500 E10 S Mr. 0 0 False
699 700 0 3 Humblen, Mr. Adolf Mathias Nicolai Olsen male 42 0 0 348121 7.6500 F G63 S Mr. 0 0 False
715 716 0 3 Soholt, Mr. Peter Andreas Lauritz Andersen male 19 0 0 348124 7.6500 F G73 S Mr. 0 0 False
751 752 1 3 Moor, Master. Meier male 6 0 1 392096 12.4750 E121 S Master 0 1 True
776 777 0 3 Tobin, Mr. Roger male NaN 0 0 383121 7.7500 F38 Q Mr. 0 0 False
823 824 1 3 Moor, Mrs. (Beila) female 27 0 1 392096 12.4750 E121 S Mrs. 0 1 True

In [227]:
the_type = 2
death_counts = pd.crosstab([titanic[titanic['Pclass']==the_type]["Embarked_num"],titanic[titanic['Pclass']==the_type]["title"]], titanic[titanic['Pclass']==the_type]["Survived"].astype(bool))
death_counts.plot(kind='bar', stacked=True, color=['black','gold'], grid=False)
death_counts.div(death_counts.sum(1).astype(float), axis=0).plot(kind='barh', stacked=True, color=['black','gold'])


Out[227]:
<matplotlib.axes._subplots.AxesSubplot at 0x116bf4fd0>

In [ ]:
cols_describe = []
for i in titanic.columns:
    cols_describe.append(i,get_my_column_type(titanic[i]),len(titanic[i].unique()))

titles_num = {'Mr.':4, 'Miss.':2,'Mrs.':3,'Master':1}
titanic["title_num"] = 0
test["title_num"] = 0
for title in titles_num:
    titanic.loc[(titanic.Name.str.contains(title))&(titanic.title_num == 0),'title_num'] = titles_num[title]
    test.loc[(test.Name.str.contains(title))&(test.title_num == 0),'title_num'] = titles_num[title]

In [208]:
get_my_column_type(titanic["Sex"]) == str


Out[208]:
True

In [224]:
cols_describe = []
my_dataframes = [titanic,test]
cat_classif = {}
cat_classif["title"] = {'Mr.':4, 'Miss.':2,'Mrs.':3,'Master':1}
for i in titanic.columns:
    cols_describe.append([i,get_my_column_type(titanic[i]),len(titanic[i].dropna().unique())])
for categorie_d in [(x[0],x[2]) for x in cols_describe if x[1] == str and x[2] < 10]:
    categorie = categorie_d[0]
    cat_name = categorie+"_num"
    for df in my_dataframes:
        df[cat_name] = 0
    if categorie in cat_classif:
        my_classif = cat_classif[categorie]
    else:
        my_categories_list = set()
        for df in my_dataframes:
            my_categories_list = my_categories_list.union(set(df[categorie].unique()))
        my_categories_list = sorted(list(my_categories_list))
        cat_classif = {i:my_categories_list.index(i)+1 for i in my_categories_list}
    for class_cat in cat_classif:
        for df in my_dataframes:
            df.loc[(df[categorie] == class_cat )&(df[cat_name] == 0),cat_name] = cat_classif[class_cat]

In [223]:
set(my_categories_list).union({'a'})


Out[223]:
{'Master', 'Miss.', 'Mr.', 'Mrs.', 'a'}

In [225]:
titanic[0:2]


Out[225]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked title on_the_ticket total_relatives in_a_group Sex_num Embarked_num title_num
0 1 0 3 Braund, Mr. Owen Harris male 22 1 0 A/5 21171 7.2500 NaN S Mr. 0 1 True 2 4 3
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38 1 0 PC 17599 71.2833 C85 C Mrs. 0 1 True 1 2 4

In [ ]: